Goals
Getting Started
Load the usual packages and run the following code, which defines a shaded_normal() function written specifically for this activity:
# Load packages
library(dplyr)
library(ggplot2)
# Plot a Normal(mean, sd^2) density curve over mean +/- 4 sd, shading the
# region between a and b in blue.
#
# Args:
#   mean: mean of the Normal model.
#   sd:   standard deviation of the Normal model.
#   a, b: lower / upper bounds of the shaded region. If NULL, the bound
#         defaults to the corresponding edge of the plotted range.
#
# Returns: a ggplot object (y-axis labeled "density").
shaded_normal <- function(mean, sd, a = NULL, b = NULL) {
  min_x <- mean - 4 * sd
  max_x <- mean + 4 * sd
  # Clip the shaded region to the plotted range. When a or b is NULL,
  # max(NULL, x) / min(NULL, x) simply return x, so the shading spans
  # out to the edge of the plot on that side.
  a <- max(a, min_x)
  b <- min(b, max_x)
  ggplot() +
    # Tick marks at the mean and at 1, 2, 3 standard deviations on each side,
    # matching the 68-95-99.7 Rule.
    scale_x_continuous(
      limits = c(min_x, max_x),
      breaks = c(mean - sd * (0:3), mean + sd * (1:3))
    ) +
    stat_function(fun = dnorm, args = list(mean = mean, sd = sd)) +
    stat_function(
      geom = "area", fun = dnorm, args = list(mean = mean, sd = sd),
      xlim = c(a, b), fill = "blue"
    ) +
    labs(y = "density")
}
Github user Tony McGovern has compiled and made available presidential election results for the population of all 3000+ U.S. counties (except Alaska). (Image: Wikimedia Commons)
Import the combined and slightly wrangled data:
# Import & wrangle data
# Read the county-level results, then convert the GOP vote-share columns
# from proportions (0-1) to percentages (0-100) for readability.
elections <- read.csv("https://www.macalester.edu/~ajohns24/data/election_2020.csv") %>%
mutate(per_gop_2020 = 100*per_gop_2020,
per_gop_2016 = 100*per_gop_2016, per_gop_2012 = 100*per_gop_2012)
The Republican (or “GOP”) candidate for president was Donald Trump in 2020 and Mitt Romney in 2012. Our goal will be to understand how Trump’s 2020 vote percentage (per_gop_2020) relates to Romney’s 2012 vote percentage (per_gop_2012). BUT let’s pretend that we’re working within the typical scenario - we don’t have access to the entire population of interest. Instead, we need to estimate population trends using data from a randomly selected sample of counties.
We’ll be taking some random samples of counties throughout this activity. The underlying random number generator plays a role in the random sample we happen to get:
# Try the following chunk A FEW TIMES
sample_n(elections, size = 2, replace = FALSE)
## state_name state_abbr county_name county_fips votes_gop_2020
## 1 Virginia VA Fauquier County 51061 25106
## 2 Mississippi MS Calhoun County 28013 4625
## votes_dem_2020 total_votes_2020 per_gop_2020 per_dem_2020 per_point_diff_2020
## 1 17565 43661 57.50212 0.4023041 0.1727171
## 2 1902 6595 70.12889 0.2884003 0.4128886
## votes_dem_2016 votes_gop_2016 total_votes_2016 per_dem_2016 per_gop_2016
## 1 12960 22110 37124 0.3491003 59.55716
## 2 1884 4360 6339 0.2972078 68.78056
## per_point_diff_2016 total_votes_2012 votes_dem_2012 votes_gop_2012
## 1 -0.2464713 35339 13891 20955
## 2 -0.3905979 6273 2250 3961
## per_dem_2012 per_gop_2012 per_point_diff_2012
## 1 0.3930785 59.29709 -0.1998925
## 2 0.3586801 63.14363 -0.2727563
# Try the following FULL chunk A FEW TIMES
set.seed(155)
sample_n(elections, size = 2, replace = FALSE)
## state_name state_abbr county_name county_fips votes_gop_2020
## 1 Texas TX Baylor County 48023 1494
## 2 Indiana IN Dearborn County 18029 19528
## votes_dem_2020 total_votes_2020 per_gop_2020 per_dem_2020 per_point_diff_2020
## 1 183 1702 87.77908 0.1075206 0.7702703
## 2 5446 25383 76.93338 0.2145530 0.5547808
## votes_dem_2016 votes_gop_2016 total_votes_2016 per_dem_2016 per_gop_2016
## 1 191 1267 1492 0.1280161 84.91957
## 2 4883 18110 23910 0.2042242 75.74237
## per_point_diff_2016 total_votes_2012 votes_dem_2012 votes_gop_2012
## 1 -0.7211796 1592 267 1297
## 2 -0.5531995 22352 6527 15391
## per_dem_2012 per_gop_2012 per_point_diff_2012
## 1 0.1677136 81.46985 -0.6469849
## 2 0.2920097 68.85737 -0.3965641
NOTE: If we set.seed(some positive integer) before taking a random sample, we’ll get the same results. This reproducibility is important:
set.seed(818934757)
my_sample <- sample_n(elections, size = 10, replace = FALSE)
my_sample
## state_name state_abbr county_name county_fips votes_gop_2020
## 1 Missouri MO Schuyler County 29197 1606
## 2 New York NY Clinton County 36019 16514
## 3 North Dakota ND Billings County 38007 541
## 4 Massachusetts MA Norfolk County 25021 125294
## 5 Georgia GA Emanuel County 13107 6553
## 6 Montana MT Cascade County 30013 23315
## 7 Tennessee TN Bledsoe County 47007 4725
## 8 North Dakota ND Pierce County 38069 1585
## 9 Oklahoma OK Johnston County 40069 3441
## 10 Alabama AL Perry County 1105 1339
## votes_dem_2020 total_votes_2020 per_gop_2020 per_dem_2020
## 1 373 2003 80.17973 0.1862207
## 2 18364 35437 46.60101 0.5182154
## 3 72 635 85.19685 0.1133858
## 4 273312 407751 30.72807 0.6702914
## 5 2886 9505 68.94266 0.3036297
## 6 15456 39885 58.45556 0.3875141
## 7 971 5758 82.05974 0.1686349
## 8 497 2128 74.48308 0.2335526
## 9 738 4251 80.94566 0.1736062
## 10 3860 5230 25.60229 0.7380497
## per_point_diff_2020 votes_dem_2016 votes_gop_2016 total_votes_2016
## 1 0.61557664 354 1505 1933
## 2 -0.05220532 13446 13181 28378
## 3 0.73858268 58 492 592
## 4 -0.36301076 219129 119171 357781
## 5 0.38579695 2433 5330 7838
## 6 0.19704149 12053 19343 33791
## 7 0.65196249 896 3621 4650
## 8 0.51127820 431 1433 2020
## 9 0.63585039 782 3081 4001
## 10 -0.48202677 3823 1403 5255
## per_dem_2016 per_gop_2016 per_point_diff_2016 total_votes_2012
## 1 0.18313502 77.85825 -0.59544749 1939
## 2 0.47381775 46.44795 0.00933822 27622
## 3 0.09797297 83.10811 -0.73310811 577
## 4 0.61246684 33.30836 0.27938320 350195
## 5 0.31041082 68.00204 -0.36960959 8060
## 6 0.35669261 57.24305 -0.21573792 33952
## 7 0.19268817 77.87097 -0.58602151 4359
## 8 0.21336634 70.94059 -0.49603960 2162
## 9 0.19545114 77.00575 -0.57460635 3778
## 10 0.72749762 26.69838 0.46051380 6065
## votes_dem_2012 votes_gop_2012 per_dem_2012 per_gop_2012 per_point_diff_2012
## 1 697 1174 0.3594636 60.54667 -0.24600309
## 2 17123 10054 0.6199044 36.39852 0.25591919
## 3 89 472 0.1542461 81.80243 -0.66377816
## 4 200891 144654 0.5736547 41.30670 0.16058767
## 5 2919 5086 0.3621588 63.10174 -0.26885856
## 6 14949 18039 0.4402981 53.13089 -0.09101084
## 7 1267 3022 0.2906630 69.32783 -0.40261528
## 8 656 1462 0.3034228 67.62257 -0.37280296
## 9 1134 2644 0.3001588 69.98412 -0.39968237
## 10 4536 1504 0.7478978 24.79802 0.49991756
my_sample %>%
summarize(mean(per_gop_2020))
## mean(per_gop_2020)
## 1 63.31947
ggplot(my_sample, aes(x = per_gop_2012, y = per_gop_2020)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Fit a simple linear model of Trump's 2020 county vote share on Romney's
# 2012 share, using only the 10 sampled counties, and show the estimated
# intercept and slope with their standard errors.
my_model <- lm(per_gop_2020 ~ per_gop_2012, my_sample)
summary(my_model)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.226421 8.796304 -0.3667928 0.7232855947
## per_gop_2012 1.171542 0.148504 7.8889629 0.0000482836
Indicate your sample mean, intercept, and slope estimates in this survey. You’ll come back to analyze this experiment data later in the activity!
In this section you will practice the concepts you explored in today’s video (slides)
Suppose that the speeds of cars on a highway, in miles per hour, can be reasonably represented by the following model: N(55, 5^2).
\[55 ~\&~ 2\]
# once you’re ready to run the code.)
shaded_normal(mean = 55, sd = 2)
a is the lower end of the range and b is the upper end.
shaded_normal(mean = 55, sd = 2, a = 53, b = 57)
shaded_normal(mean = 55, sd = 2, a = 51, b = 59)
shaded_normal(mean = 55, sd = 2, a = 49, b = 61)
We can also use the 68-95-99.7 Rule to calculate probabilities!
shaded_normal(mean = 55, sd = 5, a = 60)
(100-68)/2
## [1] 16
shaded_normal(mean = 55, sd = 5, b = 45)
(100-95)/2
## [1] 2.5
shaded_normal(mean = 55, sd = 5, b = 40)
(100-99.7)/2
## [1] 0.15
Speeds don’t always fall exactly in increments of 5pmh (the standard deviation) from the mean of 55mph. Though we can use other tools to calculate probabilities in these scenarios, the 68-95-99.7 Rule can still provide insight. For each scenario below, indicate which range the probability falls into: less than 0.0015, between 0.0015 and 0.025, between 0.025 and 0.16, or greater than 0.16.
shaded_normal(mean = 55, sd = 5, a = 57)
greater than 0.16
shaded_normal(mean = 55, sd = 5, a = 67)
between 0.0015 and 0.025
shaded_normal(mean = 55, sd = 5, a = 71)
less than 0.0015
Inherently important to all of our calculations above is how many standard deviations a value “X” is from the mean. This distance is called a Z-score and can be calculated as follows:
(X - mean) / sd
# Z-score: how many standard deviations (sig) the value `val` lies from
# the mean `mu`. Positive = above the mean, negative = below.
calc_z <- function(mu, sig, val) {
  deviation <- val - mu
  deviation / sig
}
calc_z(55, 5, 60)
## [1] 1
About 35% of drivers are driving at a speed farther away from the limit.
calc_z(30, 3, 36)
## [1] 2
Only 2.5% of drivers are driving faster on these roads - this person is speeding more noticeably.
# Driver A
shaded_normal(mean = 55, sd = 5) +
geom_vline(xintercept = 60)
# Driver B
shaded_normal(mean = 30, sd = 3) +
geom_vline(xintercept = 36)
see above
The Normal exercises above are directly relevant to using our sample data to learn about the broader population of interest. Let’s see where these two ideas connect!
Recall that each student took a sample of 10 U.S. counties. From this sample, you each calculated Trump’s mean 2020 support and modeled Trump’s 2020 results by Romney’s 2012 results. Import the resulting sample_mean, sample_intercept, and sample_slope values:
results <- read.csv("https://www.macalester.edu/~ajohns24/data/election_simulation.csv")
names(results) <- c("time", "sample_mean", "sample_intercept", "sample_slope")
sample_mean values. What’s the range in estimates? What’s the shape of the density plot (does it look Normal-ish)?
ggplot(results, aes(x = sample_mean)) +
geom_density()
Probably around 66. It looks like that’s where the peak is.
elections %>%
summarize(mean(per_gop_2020))
## mean(per_gop_2020)
## 1 64.99124
Next, let’s examine how our estimates of the relationship between Trump’s 2020 support and Romney’s 2012 support varied from student to student.
sample_intercept and sample_slope values. (Do these look Normal-ish?)
ggplot(results, aes(x = sample_intercept)) +
geom_density()
ggplot(results, aes(x = sample_slope)) +
geom_density()
ggplot(elections, aes(x = per_gop_2012, y = per_gop_2020)) +
geom_abline(data = results, aes(intercept = sample_intercept, slope = sample_slope), color = "gray") +
geom_smooth(method = "lm", color = "red", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Fit the same model to the FULL population of counties. These coefficients
# are the "true" intercept and slope that each student's sample estimates
# were trying to approximate.
actual_trend <- lm(per_gop_2020 ~ per_gop_2012, elections)
summary(actual_trend)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.179393 0.486313057 10.65033 4.860116e-26
## per_gop_2012 1.000235 0.007896805 126.66328 0.000000e+00
Reflect upon the exercises above. Summarize what you’ve learned about using sample data to estimate features of a broader population. For example, how do sample estimates behave? How might we model sample estimates?
Visualizing the election results on an actual map can provide some intuition for our work. To make maps, load the following package. NOTE: You’ll likely need to install this package first.
library(socviz)
Now process the data to include mapping info (eg: latitude and longitude coordinates):
# Attach county polygon coordinates (socviz::county_map) to the election
# results so they can be drawn with geom_polygon().
mapping_data <- elections %>%
  # county_map identifies counties by a 5-character FIPS string named "id"
  rename(id = county_fips) %>%
  mutate(id = as.character(id)) %>%
  # Restore the leading zero that numeric parsing dropped from 4-digit FIPS
  mutate(id = ifelse(nchar(id) == 4, paste0("0", id), id)) %>%
  # BUG FIX: the original call was left_join(county_map, elections, by = "id");
  # inside the pipe, the stray `elections` became a third positional argument
  # and was matched to left_join()'s `copy` parameter. Only the join table
  # and key belong here.
  left_join(county_map, by = "id")
Now make some maps!
ggplot(mapping_data, aes(x = long, y = lat, fill = per_gop_2020, group = group)) +
coord_equal() +
geom_polygon(color = NA)
ggplot(mapping_data, aes(x = long, y = lat, fill = per_gop_2020, group = group)) +
coord_equal() +
geom_polygon(color = NA) +
scale_fill_gradientn(colours = c("blue", "purple", "red"))
mn <- mapping_data %>%
filter(state_name == "Minnesota")
ggplot(mn, aes(x = long, y = lat, fill = per_gop_2020, group = group)) +
coord_equal() +
geom_polygon(color = NA) +
scale_fill_gradientn(colours = c("blue", "purple", "red"), values = scales::rescale(seq(0, 100, by = 10)))
Play around!
per_gop_2020 and per_gop_2016 (i.e. how did Trump’s support shift from 2016 to 2020?).
mn <- mapping_data %>%
filter(state_name == "California")
ggplot(mn, aes(x = long, y = lat, fill = per_gop_2020, group = group)) +
coord_equal() +
geom_polygon(color = NA) +
scale_fill_gradientn(colours = c("blue", "purple", "red"), values = scales::rescale(seq(0, 100, by = 10)))
mn <- mapping_data %>%
filter(state_name == "Minnesota")
ggplot(mn, aes(x = long, y = lat, fill = per_gop_2012, group = group)) +
coord_equal() +
geom_polygon(color = NA) +
scale_fill_gradientn(colours = c("blue", "purple", "red"), values = scales::rescale(seq(0, 100, by = 10)))
# County-level shift in Trump support from 2016 to 2020 (positive values =
# gained support), mapped with a blue-purple-red gradient.
mapping_data <- mapping_data %>% mutate(diff_20_16 = per_gop_2020 - per_gop_2016)
ggplot(mapping_data, aes(x = long, y = lat, fill = diff_20_16, group = group)) +
coord_equal() +
geom_polygon(color = NA) +
scale_fill_gradientn(colours = c("blue", "purple", "red"))